### Load required libraries
library(readxl)
library(tidyverse)
library(purrr)
library(dplyr)
library(knitr)
library(DT)
library(httr)
library(jsonlite)
library(stringr)
library(GSEABase)
### Load functions
source(file = "../02_functions.R")
### Define paths
current_dir <- getwd()
data_path <- file.path(current_dir,
"../../data/_raw")
results_pc_path <- file.path(current_dir,
"../../results/patient_characteristics/output")
results_fip_path <- file.path(current_dir,
"../../results/somatic_mutation_analysis/functional_impact_prediction/output")
results_ubgsea_path <- file.path(current_dir,
"../../results/somatic_mutation_analysis/ubiquitin_gsea/output")
# Create the folder if it does not exist
if(!file.exists(results_ubgsea_path)) {
dir.create(results_ubgsea_path, recursive = TRUE)
}
figures_ubgsea_path <- file.path(current_dir,
"../../results/somatic_mutation_analysis/ubiquitin_gsea/figures")
# Create the folder if it does not exist
if(!file.exists(figures_ubgsea_path)) {
dir.create(figures_ubgsea_path, recursive = TRUE)
}
maf_df_path <- file.path(results_fip_path, "all_mutations.csv") # All mutations df
nonsyn_df_path <- file.path(results_fip_path, "nonsyn_mutations.csv") # Nonsyn mutations df
lof_df_path <- file.path(results_fip_path, "lof_mutations_reduced.csv") # LoF mutations df
# Sup1 data
sup1_response_path <- file.path(results_pc_path,
"sup1_response.csv")
### Read data & wrangle
# Read 3 lists of mutations
# All
maf_df_annotated <- read.csv(maf_df_path, header = TRUE)[ , -1]
list_of_all_mutations <- split(maf_df_annotated, maf_df_annotated$sample_id
)
# Nonsyn
nonsyn_df_annotated <- read.csv(nonsyn_df_path, header = TRUE)[ , -1]
list_of_nonsyn_mutations <- split(nonsyn_df_annotated, nonsyn_df_annotated$sample_id)
# LoF
lof_df_annotated <- read.csv(lof_df_path, header = TRUE)[ , -1]
list_of_lof_mutations <- split(lof_df_annotated, lof_df_annotated$sample_id)
# Get the list of patient IDs
sample_id_list <- unique(maf_df_annotated$sample_id)
## Read sup1_response.csv
sup1_response_df <- read.csv(sup1_response_path)[ , -1] # Drop first column
# df to merge response info later
response_df <- sup1_response_df %>%
dplyr::rename(sample_id = Sample.ID,
patient_response = Patient.Response) %>%
dplyr::select(sample_id, patient_response)
# Create a list of Responders and Non-responders
sample_id_lists <- split(sup1_response_df$Sample.ID,
sup1_response_df$Patient.Response) # Split the "Sample.ID" values into lists based on "Patient.Response"
r_sample_ids <- sample_id_lists$R
nr_sample_ids <- sample_id_lists$NR
# Define output path
mm909_sum_plot_path <- file.path(figures_ubgsea_path,
"mm909_sum_plot.png")
# Summary plot
mm909_sum_plot <- sum_plot(all_df_ubiquitin, nonsyn_df_ubiquitin, lof_df_ubiquitin, mm909_sum_plot_path)
# Display
print(mm909_sum_plot)
Sample MM909_47 does not have any Ubiquitin-related gene.
Consists of gene sets derived from the Gene Ontology (GO) annotations. Here, we focus on Gene sets derived from the GO Biological Process (BP) ontology.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_c5bp_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_c5bp_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_c5bp_lof.csv")
# # Perform FORA analysis for C5:BP Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C5", subcategory="BP")
# Load fora results
fora_results_c5bp_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c5bp_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c5bp_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_c5bp_all_plot_path <- file.path(figures_ubgsea_path,
"mm909_c5bp_all_plot.png")
mm909_c5bp_all_plot <- plot_top5(response_df, fora_results_c5bp_all_df, mm909_c5bp_all_plot_path, "FORA with top C5 BP across samples (All mutations)", "Top C5 BP Gene Sets")
print(mm909_c5bp_all_plot)
# Define output path
mm909_c5bp_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_c5bp_nonsyn_plot.png")
mm909_c5bp_nonsyn_plot <- plot_top5(response_df, fora_results_c5bp_nonsyn_df, mm909_c5bp_nonsyn_plot_path, "FORA with top C5 BP across samples (Nonsyn mutations)", "Top C5 BP Gene Sets")
print(mm909_c5bp_nonsyn_plot)
# Define output path
mm909_c5bp_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_c5bp_lof_plot.png")
mm909_c5bp_lof_plot <- plot_top5(response_df, fora_results_c5bp_lof_df, mm909_c5bp_lof_plot_path, "FORA with top C5 BP across samples (LoF mutations)", "Top C5 BP Gene Sets")
print(mm909_c5bp_lof_plot)
Consists of gene sets derived from the Gene Ontology (GO) annotations. Here, we focus on Gene sets derived from the GO Cellular Component (CC) ontology.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_c5cc_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_c5cc_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_c5cc_lof.csv")
# # Perform FORA analysis for H Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C5", subcategory="CC")
# Load fora results
fora_results_c5cc_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c5cc_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c5cc_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_c5cc_proteasome_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_c5cc_proteasome_lof_plot.png")
proteasome_genesets = c("GOCC_PROTEASOME_ACCESSORY_COMPLEX", "GOCC_PROTEASOME_COMPLEX", "GOCC_PROTEASOME_CORE_COMPLEX", "GOCC_PROTEASOME_CORE_COMPLEX_ALPHA_SUBUNIT_COMPLEX", "GOCC_PROTEASOME_CORE_COMPLEX_BETA_SUBUNIT_COMPLEX", "GOCC_PROTEASOME_REGULATORY_PARTICLE", "GOCC_PROTEASOME_REGULATORY_PARTICLE_BASE_SUBCOMPLEX", "GOCC_PROTEASOME_REGULATORY_PARTICLE_LID_SUBCOMPLEX")
mm909_c5cc_proteasome_lof_plot <- plot_geneset(proteasome_genesets, response_df, fora_results_c5cc_lof_df, mm909_c5cc_proteasome_lof_plot_path, "FORA with PROTEASOME-related C5 CC Gene Sets across samples (LoF mutations)", "PROTEASOME Gene Sets")
print(mm909_c5cc_proteasome_lof_plot)
Hallmark gene sets summarize and represent specific well-defined biological states or processes and display coherent expression.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_h_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_h_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_h_lof.csv")
# # Perform FORA analysis for H Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="H")
# Load fora results
fora_results_h_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_h_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_h_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
Not enough data
# Define output path
mm909_h_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_h_nonsyn_plot.png")
mm909_h_nonsyn_plot <- plot_top5(response_df, fora_results_h_nonsyn_df, mm909_h_nonsyn_plot_path, "FORA with top H across samples (Nonsyn mutations)", "Top H Gene Sets")
print(mm909_h_nonsyn_plot)
# Define output path
mm909_h_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_h_lof_plot.png")
mm909_h_lof_plot <- plot_top5(response_df, fora_results_h_lof_df, mm909_h_lof_plot_path, "FORA with top H across samples (LoF mutations)", "Top H Gene Sets")
print(mm909_h_lof_plot)
Filter specifically for HALLMARK_APOPTOSIS gene set.
# Define output path
mm909_h_apoptosis_plot_path <- file.path(figures_ubgsea_path,
"mm909_h_apoptosis_plot.png")
mm909_h_apoptosis_plot <- plot_geneset(c("HALLMARK_APOPTOSIS"), response_df, fora_results_h_lof_df, mm909_h_apoptosis_plot_path, "FORA with HALLMARK_APOPTOSIS across samples (LoF mutations)", "HALLMARK_APOPTOSIS Gene Set")
print(mm909_h_apoptosis_plot)
Gene sets that represent signatures of cellular pathways which are often dis-regulated in cancer. The majority of signatures were generated directly from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation of known cancer genes.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_c6_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_c6_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_c6_lof.csv")
# # Perform FORA analysis for C6 Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C6")
# Load fora results
fora_results_c6_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c6_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c6_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
Not enough data
Not enough data
Not enough data
Gene sets that represent cell states and perturbations within the immune system.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_c7_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_c7_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_c7_lof.csv")
# # Perform FORA analysis for C7 Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C7")
# Load fora results
fora_results_c7_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c7_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c7_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_c7_all_plot_path <- file.path(figures_ubgsea_path,
"mm909_c7_all_plot.png")
mm909_c7_all_plot <- plot_top5(response_df, fora_results_c7_all_df, mm909_c7_all_plot_path, "FORA with top C7 across samples (All mutations)", "Top C7 Gene Sets")
print(mm909_c7_all_plot)
# Define output path
mm909_c7_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_c7_nonsyn_plot.png")
mm909_c7_nonsyn_plot <- plot_top5(response_df, fora_results_c7_nonsyn_df, mm909_c7_nonsyn_plot_path, "FORA with top C7 across samples (Nonsyn mutations)", "Top C7 Gene Sets")
print(mm909_c7_nonsyn_plot)
# Define output path
mm909_c7_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_c7_lof_plot.png")
mm909_c7_lof_plot <- plot_top5(response_df, fora_results_c7_lof_df, mm909_c7_lof_plot_path, "FORA with top C7 across samples (LoF mutations)", "Top C7 Gene Sets")
print(mm909_c7_lof_plot)
I searched for the term “ubiquitin” in MSigDB and downloaded a GMT file with all the hits (Gene Sets). Number of GS: 101.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_ubiquitin_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_ubiquitin_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_ubiquitin_lof.csv")
# # Load GMT file as a GeneSetCollection object
# gene_sets_collection <- getGmt(file.path(data_path, "genesets_ubiquitin.v2023.2.Hs.gmt"))
#
# # Initialize an empty list to store the results
# gene_sets_list <- list()
#
# # Iterate over each GeneSet in the GeneSetCollection
# for (setName in names(gene_sets_collection)) {
# # Extract the GeneSet by its name
# geneSet <- gene_sets_collection[[setName]]
#
# # Convert Hugo symbols to Ensembl IDs for this GeneSet
# ensembl_ids <- SYMBOLtoENSEMBL(geneIds(geneSet))
#
# # Ensure the result is a character vector of Ensembl Gene IDs
# ensembl_ids_vector <- as.character(ensembl_ids)
#
# # Store the converted IDs in the list, associated with the setName
# gene_sets_list[[setName]] <- ensembl_ids_vector
# }
#
# # Set names for each element in the list to match the setName
# names(gene_sets_list) <- names(gene_sets_collection)
#
# # Perform FORA analysis for Ubiquitin Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, gene_sets_list=gene_sets_list)
# Load fora results
fora_results_ubiquitin_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_ubiquitin_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_ubiquitin_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_ubiquitin_all_plot_path <- file.path(figures_ubgsea_path,
"mm909_ubiquitin_all_plot.png")
mm909_ubiquitin_all_plot <- plot_top5(response_df, fora_results_ubiquitin_all_df, mm909_ubiquitin_all_plot_path, "FORA with top Ubiquitin GS across samples (All mutations)", "Top Ubiquitin Gene Sets")
print(mm909_ubiquitin_all_plot)
# Define output path
mm909_ubiquitin_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_ubiquitin_nonsyn_plot.png")
mm909_ubiquitin_nonsyn_plot <- plot_top5(response_df, fora_results_ubiquitin_nonsyn_df, mm909_ubiquitin_nonsyn_plot_path, "FORA with top Ubiquitin GS across samples (Nonsyn mutations)", "Top Ubiquitin Gene Sets")
print(mm909_ubiquitin_nonsyn_plot)
# Define output path
mm909_ubiquitin_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_ubiquitin_lof_plot.png")
mm909_ubiquitin_lof_plot <- plot_top5(response_df, fora_results_ubiquitin_lof_df, mm909_ubiquitin_lof_plot_path, "FORA with top Ubiquitin GS across samples (LoF mutations)", "Top Ubiquitin Gene Sets")
print(mm909_ubiquitin_lof_plot)
Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts. Here, we focus on Chemical and genetic perturbations (CGP).
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_c2cgp_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_c2cgp_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_c2cgp_lof.csv")
# # Perform FORA analysis for C2_CGP Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C2", subcategory="CGP")
# Load fora results
fora_results_c2cgp_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_c2cgp_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_c2cgp_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_c2cgp_all_plot_path <- file.path(figures_ubgsea_path,
"mm909_c2cgp_all_plot.png")
mm909_c2cgp_all_plot <- plot_top5(response_df, fora_results_c2cgp_all_df, mm909_c2cgp_all_plot_path, "FORA with top C2:CGP across samples (All mutations)", "Top C2:CGP Gene Sets")
print(mm909_c2cgp_all_plot)
# Define output path
mm909_c2cgp_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_c2cgp_nonsyn_plot.png")
mm909_c2cgp_nonsyn_plot <- plot_top5(response_df, fora_results_c2cgp_nonsyn_df, mm909_c2cgp_nonsyn_plot_path, "FORA with top C2:CGP across samples (Nonsyn mutations)", "Top C2:CGP Gene Sets")
print(mm909_c2cgp_nonsyn_plot)
# Define output path
mm909_c2cgp_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_c2cgp_lof_plot.png")
mm909_c2cgp_lof_plot <- plot_top5(response_df, fora_results_c2cgp_lof_df, mm909_c2cgp_lof_plot_path, "FORA with top C2:CGP across samples (LoF mutations)", "Top C2:CGP Gene Sets")
print(mm909_c2cgp_lof_plot)
Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts. Here, we focus on Canonical pathways (CP), specifically Canonical Pathways gene sets derived from the KEGG MEDICUS pathway database.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_keggmed_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_keggmed_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_keggmed_lof.csv")
# # Load GMT file as a GeneSetCollection object
# gene_sets_collection <- getGmt(file.path(data_path, "c2.cp.kegg_medicus.v2023.2.Hs.entrez.gmt"))
#
# # Initialize an empty list to store the results
# gene_sets_list <- list()
#
# # Iterate over each GeneSet in the GeneSetCollection
# for (setName in names(gene_sets_collection)) {
# # Extract the GeneSet by its name
# geneSet <- gene_sets_collection[[setName]]
#
# # Convert Entrez IDs to Ensembl IDs for this GeneSet
# ensembl_ids <- ENTREZtoENSEMBL(geneIds(geneSet))
#
# # Ensure the result is a character vector of Ensembl Gene IDs
# ensembl_ids_vector <- as.character(ensembl_ids)
#
# # Store the converted IDs in the list, associated with the setName
# gene_sets_list[[setName]] <- ensembl_ids_vector
# }
#
# # Set names for each element in the list to match the setName
# names(gene_sets_list) <- names(gene_sets_collection)
#
# # Perform FORA analysis for KEGG_MEDICUS Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, gene_sets_list=gene_sets_list)
# Load fora results
fora_results_keggmed_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_keggmed_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_keggmed_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_keggmed_all_plot_path <- file.path(figures_ubgsea_path,
"mm909_keggmed_all_plot.png")
mm909_keggmed_all_plot <- plot_top5(response_df, fora_results_keggmed_all_df, mm909_keggmed_all_plot_path, "FORA with top C2:CP:KEGG_MEDICUS across samples (All mutations)", "Top C2:CP:KEGG_MEDICUS Gene Sets")
print(mm909_keggmed_all_plot)
Not enough data
# Define output path
mm909_keggmed_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_keggmed_nonsyn_plot.png")
mm909_keggmed_nonsyn_plot <- plot_top5(response_df, fora_results_keggmed_nonsyn_df, mm909_keggmed_nonsyn_plot_path, "FORA with top C2:CP:KEGG_MEDICUS across samples (Nonsyn mutations)", "Top C2:CP:KEGG_MEDICUS Gene Sets")
print(mm909_keggmed_nonsyn_plot)
Not enough data
Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts. Here, we focus on Canonical pathways (CP), specifically Canonical Pathways gene sets derived from the Reactome pathway database.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_reactome_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_reactome_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_reactome_lof.csv")
# # Perform FORA analysis for C2_CGP Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, category="C2", subcategory="CP:REACTOME")
# Load fora results
fora_results_reactome_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_reactome_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_reactome_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_reactome_all_plot_path <- file.path(figures_ubgsea_path,
"mm909_reactome_all_plot.png")
mm909_reactome_all_plot <- plot_top5(response_df, fora_results_reactome_all_df, mm909_reactome_all_plot_path, "FORA with top C2:CP:REACTOME across samples (All mutations)", "Top C2:CP:REACTOME Gene Sets")
print(mm909_reactome_all_plot)
# Define output path
mm909_reactome_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_reactome_nonsyn_plot.png")
mm909_reactome_nonsyn_plot <- plot_top5(response_df, fora_results_reactome_nonsyn_df, mm909_reactome_nonsyn_plot_path, "FORA with top C2:CP:REACTOME across samples (Nonsyn mutations)", "Top C2:CP:REACTOME Gene Sets")
print(mm909_reactome_nonsyn_plot)
# Define output path
mm909_reactome_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_reactome_lof_plot.png")
mm909_reactome_lof_plot <- plot_top5(response_df, fora_results_reactome_lof_df, mm909_reactome_lof_plot_path, "FORA with top C2:CP:REACTOME across samples (LoF mutations)", "Top C2:CP:REACTOME Gene Sets")
print(mm909_reactome_lof_plot)
I searched for the term “proteasome” in MSigDB and downloaded a GMT file with all the hits (Gene Sets). Number of GS: 56.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_proteasome_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_proteasome_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_proteasome_lof.csv")
# # Load GMT file as a GeneSetCollection object
# gene_sets_collection <- getGmt(file.path(data_path, "genesets_proteasome.v2023.2.Hs.gmt"))
#
# # Initialize an empty list to store the results
# gene_sets_list <- list()
#
# # Iterate over each GeneSet in the GeneSetCollection
# for (setName in names(gene_sets_collection)) {
# # Extract the GeneSet by its name
# geneSet <- gene_sets_collection[[setName]]
#
# # Convert Hugo Symbols to Ensembl IDs for this GeneSet
# ensembl_ids <- SYMBOLtoENSEMBL(geneIds(geneSet))
#
# # Ensure the result is a character vector of Ensembl Gene IDs
# ensembl_ids_vector <- as.character(ensembl_ids)
#
# # Store the converted IDs in the list, associated with the setName
# gene_sets_list[[setName]] <- ensembl_ids_vector
# }
#
# # Perform FORA analysis for PROTEASOME Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_mutations, list_of_nonsyn_mutations, list_of_lof_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, gene_sets_list=gene_sets_list)
# Load fora results
fora_results_proteasome_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_proteasome_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_proteasome_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Define output path
mm909_proteasome_all_plot_path <- file.path(figures_ubgsea_path,
"mm909_proteasome_all_plot.png")
mm909_proteasome_all_plot <- plot_top5(response_df, fora_results_proteasome_all_df, mm909_proteasome_all_plot_path, "FORA with top PROTEASOME GS across samples (All mutations)", "Top PROTEASOME Gene Sets")
print(mm909_proteasome_all_plot)
# Define output path
mm909_proteasome_nonsyn_plot_path <- file.path(figures_ubgsea_path,
"mm909_proteasome_nonsyn_plot.png")
mm909_proteasome_nonsyn_plot <- plot_top5(response_df, fora_results_proteasome_nonsyn_df, mm909_proteasome_nonsyn_plot_path, "FORA with top PROTEASOME GS across samples (Nonsyn mutations)", "Top PROTEASOME Gene Sets")
print(mm909_proteasome_nonsyn_plot)
# Define output path
mm909_proteasome_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_proteasome_lof_plot.png")
mm909_proteasome_lof_plot <- plot_top5(response_df, fora_results_proteasome_lof_df, mm909_proteasome_lof_plot_path, "FORA with top PROTEASOME GS across samples (LoF mutations)", "Top PROTEASOME Gene Sets")
print(mm909_proteasome_lof_plot)
Filter specifically for KEGG_PROTEASOME gene set.
# Define output path
mm909_keggproteasome_lof_plot_path <- file.path(figures_ubgsea_path,
"mm909_proteasome_lof_plot.png")
geneset = c("KEGG_PROTEASOME")
mm909_keggproteasome_lof_plot <- plot_geneset(geneset, response_df, fora_results_proteasome_lof_df, mm909_keggproteasome_lof_plot_path, "FORA with KEGG_PROTEASOME GS across samples (LoF mutations)", "KEGG_PROTEASOME Gene Set")
print(mm909_keggproteasome_lof_plot)
I queried MSigDB with 45 interesting Gene Sets that I extracted from the previous exploratory analysis. For some reason, I could not find the GRAESSMANN_APOPTOSIS_BY_DOXORUBICIN_DN and YOSHIMURA_MAPK8_TARGETS_UP for Human when doing this, although I got them when I did the C2:CGP analysis (I always use human in the function). So, I ended up with 43. Here I only present plots containing LoF mutations.
# Define paths
fora_results_all_csv_path <- file.path(results_ubgsea_path, "fora_results_combinedGS_all.csv")
fora_results_nonsyn_csv_path <- file.path(results_ubgsea_path, "fora_results_combinedGS_nonsyn.csv")
fora_results_lof_csv_path <- file.path(results_ubgsea_path, "fora_results_combinedGS_lof.csv")
# Load GMT file as a GeneSetCollection object
gene_sets_collection <- getGmt(file.path(data_path, "genesets_combined.v2023.2.Hs.gmt"))
# Initialize an empty list to store the results
gene_sets_list <- list()
# Iterate over each GeneSet in the GeneSetCollection
for (setName in names(gene_sets_collection)) {
# Extract the GeneSet by its name
geneSet <- gene_sets_collection[[setName]]
# Convert Hugo Symbols to Ensembl IDs for this GeneSet
ensembl_ids <- SYMBOLtoENSEMBL(geneIds(geneSet))
# Ensure the result is a character vector of Ensembl Gene IDs
ensembl_ids_vector <- as.character(ensembl_ids)
# Store the converted IDs in the list, associated with the setName
gene_sets_list[[setName]] <- ensembl_ids_vector
}
# # Perform FORA analysis for combined Gene Sets
# perform_fora_analysis(sample_id_list, list_of_all_ub_mutations, list_of_nonsyn_ub_mutations, list_of_lof_ub_mutations, fora_results_all_csv_path, fora_results_nonsyn_csv_path, fora_results_lof_csv_path, gene_sets_list=gene_sets_list)
# Load fora results
fora_results_combinedGS_all_df <- read.csv(fora_results_all_csv_path, header = TRUE)[ , -1]
fora_results_combinedGS_nonsyn_df <- read.csv(fora_results_nonsyn_csv_path, header = TRUE)[ , -1]
fora_results_combinedGS_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
# Get all unique sample_id values
unique_sample_ids <- unique(fora_results_combinedGS_lof_df$sample_id)
This does not have the filter p.adjust < 0.05.
# Loop through each sample_id
for(sample_id in unique_sample_ids) {
# Define the output plot path dynamically based on the current sample_id
plot_path <- file.path(figures_ubgsea_path, paste0(sample_id, "_combinedGS_lof_barplot.png"))
# Get response group
response <- response_df %>%
dplyr::filter(sample_id == !!sample_id) %>%
dplyr::select(patient_response)
response <- as.character(response)
# Generate the plot title dynamically
plot_title <- paste("FORA with combined GS for sample", sample_id, "-", response, "(LoF mutations)")
# Call the plotting function with the current sample_id
combinedGS_lof_plot <- plot_combinedGS(sample_id, response_df, fora_results_combinedGS_lof_df, plot_path, plot_title, 15, 15)
# Print
print(combinedGS_lof_plot)
}
This does not have the filter p.adjust < 0.05.
# Loop through each geneset
for(geneset in names(gene_sets_list)) {
# Define the output plot path dynamically based on the current geneset
plot_path <- file.path(figures_ubgsea_path, paste0(geneset, "_combinedGS_lof_bubbleplot.png"))
# Generate the plot title dynamically
plot_title <- paste("FORA with", geneset, "GS per sample stratified by response (LoF mutations)")
y_label <- NULL
# Use tryCatch when calling the plotting function
tryCatch({
# Attempt to generate and print the plot
combinedGS_lof_plot <- plot_geneset(geneset, response_df, fora_results_combinedGS_lof_df, plot_path, plot_title, y_label)
print(combinedGS_lof_plot)
}, error = function(e) {
# If an error occurs, print the error message
cat("An error occurred with geneset", geneset, ": ", e$message, "\n")
})
}
## An error occurred with geneset BORCZUK_MALIGNANT_MESOTHELIOMA_DN : `breaks` and `labels` have different lengths.
## An error occurred with geneset CONCANNON_APOPTOSIS_BY_EPOXOMICIN_DN : `breaks` and `labels` have different lengths.
## An error occurred with geneset CONCANNON_APOPTOSIS_BY_EPOXOMICIN_UP : `breaks` and `labels` have different lengths.
## An error occurred with geneset GOCC_PROTEASOME_ACCESSORY_COMPLEX : `breaks` and `labels` have different lengths.
## An error occurred with geneset GOCC_PROTEASOME_REGULATORY_PARTICLE : `breaks` and `labels` have different lengths.
## An error occurred with geneset HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION : `breaks` and `labels` have different lengths.
## An error occurred with geneset HALLMARK_MITOTIC_SPINDLE : `breaks` and `labels` have different lengths.
## An error occurred with geneset HALLMARK_MYOGENESIS : `breaks` and `labels` have different lengths.
## An error occurred with geneset KEGG_PROTEASOME : `breaks` and `labels` have different lengths.
## An error occurred with geneset SAKAI_TUMOR_INFILTRATING_MONOCYTES_DN : `breaks` and `labels` have different lengths.
## An error occurred with geneset SAKAI_TUMOR_INFILTRATING_MONOCYTES_UP : `breaks` and `labels` have different lengths.
This does not have the filter p.adjust < 0.05.
# Calculate the sequence of indices in steps of 10
indices <- seq(from = 1, to = length(gene_sets_list), by = 10)
# Loop through the indices
global_idx <- 1
for (i in indices) {
# Select a group of 10 gene sets or the remaining ones if less than 10
geneset_group <- names(gene_sets_list)[i:min(i+9, length(gene_sets_list))]
# Define the output plot path dynamically based on the current index
plot_path <- file.path(figures_ubgsea_path, paste0(global_idx, "_combinedGS_lof_boxplot.png"))
# Call the plotting function with the current geneset group
combinedGS_lof_plot <- boxplot_genesets(geneset_group, response_df, fora_results_combinedGS_lof_df, plot_path,
paste("FORA with GS", i, "to", min(i+9, length(gene_sets_list)),
"stratified by response (LoF mutations)"), 15)
# Print
print(combinedGS_lof_plot)
# Update global index
global_idx = global_idx + 1
}
This does not have the filter p.adjust < 0.05.
plot_path <- file.path(figures_ubgsea_path, "mm909_combinedGS_lof_heat_map.png")
combinedGS_lof_plot <- heat_map_genesets(response_df, fora_results_combinedGS_lof_df, plot_path,
"Heat Map of combined Gene Sets per patient")
# Print
print(combinedGS_lof_plot)
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.7.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Copenhagen
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] GO.db_3.18.0 clusterProfiler_4.10.0
## [3] EnsDb.Hsapiens.v75_2.99.0 ensembldb_2.26.0
## [5] AnnotationFilter_1.26.0 GenomicFeatures_1.54.3
## [7] GenomicRanges_1.54.1 GenomeInfoDb_1.38.6
## [9] fgsea_1.28.0 msigdbr_7.5.1
## [11] GSEABase_1.64.0 graph_1.80.0
## [13] annotate_1.80.0 XML_3.99-0.16.1
## [15] AnnotationDbi_1.64.1 IRanges_2.36.0
## [17] S4Vectors_0.40.2 Biobase_2.62.0
## [19] BiocGenerics_0.48.1 jsonlite_1.8.8
## [21] httr_1.4.7 DT_0.31
## [23] knitr_1.45 lubridate_1.9.3
## [25] forcats_1.0.0 stringr_1.5.1
## [27] dplyr_1.1.4 purrr_1.0.2
## [29] readr_2.1.5 tidyr_1.3.1
## [31] tibble_3.2.1 ggplot2_3.5.0
## [33] tidyverse_2.0.0 readxl_1.4.3
##
## loaded via a namespace (and not attached):
## [1] later_1.3.2 splines_4.3.2
## [3] BiocIO_1.12.0 bitops_1.0-7
## [5] ggplotify_0.1.2 filelock_1.0.3
## [7] cellranger_1.1.0 polyclip_1.10-6
## [9] lifecycle_1.0.4 lattice_0.22-5
## [11] MASS_7.3-60.0.1 magrittr_2.0.3
## [13] sass_0.4.8 rmarkdown_2.25
## [15] jquerylib_0.1.4 yaml_2.3.8
## [17] httpuv_1.6.14 cowplot_1.1.3
## [19] DBI_1.2.1 RColorBrewer_1.1-3
## [21] abind_1.4-5 zlibbioc_1.48.0
## [23] ggraph_2.2.1 RCurl_1.98-1.14
## [25] yulab.utils_0.1.4 tweenr_2.0.3
## [27] rappdirs_0.3.3 GenomeInfoDbData_1.2.11
## [29] enrichplot_1.22.0 ggrepel_0.9.5
## [31] tidytree_0.4.6 codetools_0.2-19
## [33] DelayedArray_0.28.0 DOSE_3.28.2
## [35] xml2_1.3.6 ggforce_0.4.2
## [37] tidyselect_1.2.0 aplot_0.2.2
## [39] farver_2.1.1 viridis_0.6.5
## [41] matrixStats_1.2.0 BiocFileCache_2.10.1
## [43] GenomicAlignments_1.38.2 ellipsis_0.3.2
## [45] tidygraph_1.3.1 systemfonts_1.0.5
## [47] tools_4.3.2 progress_1.2.3
## [49] ragg_1.2.7 treeio_1.26.0
## [51] Rcpp_1.0.12 glue_1.7.0
## [53] gridExtra_2.3 SparseArray_1.2.4
## [55] xfun_0.42 qvalue_2.34.0
## [57] MatrixGenerics_1.14.0 withr_3.0.0
## [59] BiocManager_1.30.22 fastmap_1.1.1
## [61] fansi_1.0.6 digest_0.6.34
## [63] mime_0.12 timechange_0.3.0
## [65] R6_2.5.1 gridGraphics_0.5-1
## [67] textshaping_0.3.7 colorspace_2.1-0
## [69] biomaRt_2.58.2 RSQLite_2.3.5
## [71] utf8_1.2.4 generics_0.1.3
## [73] data.table_1.15.0 rtracklayer_1.62.0
## [75] prettyunits_1.2.0 graphlayouts_1.1.0
## [77] htmlwidgets_1.6.4 S4Arrays_1.2.0
## [79] scatterpie_0.2.1 pkgconfig_2.0.3
## [81] gtable_0.3.4 blob_1.2.4
## [83] XVector_0.42.0 shadowtext_0.1.3
## [85] htmltools_0.5.7 ProtGenerics_1.34.0
## [87] scales_1.3.0 png_0.1-8
## [89] ggfun_0.1.4 rstudioapi_0.15.0
## [91] tzdb_0.4.0 reshape2_1.4.4
## [93] rjson_0.2.21 nlme_3.1-164
## [95] curl_5.2.0 cachem_1.0.8
## [97] BiocVersion_3.18.1 parallel_4.3.2
## [99] HDO.db_0.99.1 restfulr_0.0.15
## [101] pillar_1.9.0 grid_4.3.2
## [103] vctrs_0.6.5 promises_1.2.1
## [105] dbplyr_2.4.0 xtable_1.8-4
## [107] evaluate_0.23 cli_3.6.2
## [109] compiler_4.3.2 Rsamtools_2.18.0
## [111] rlang_1.1.3 crayon_1.5.2
## [113] labeling_0.4.3 plyr_1.8.9
## [115] fs_1.6.3 stringi_1.8.3
## [117] viridisLite_0.4.2 BiocParallel_1.36.0
## [119] babelgene_22.9 munsell_0.5.0
## [121] Biostrings_2.70.2 lazyeval_0.2.2
## [123] GOSemSim_2.28.1 Matrix_1.6-5
## [125] hms_1.1.3 patchwork_1.2.0
## [127] bit64_4.0.5 shiny_1.8.0
## [129] KEGGREST_1.42.0 highr_0.10
## [131] interactiveDisplayBase_1.40.0 AnnotationHub_3.10.0
## [133] SummarizedExperiment_1.32.0 igraph_2.0.2
## [135] memoise_2.0.1 bslib_0.6.1
## [137] ggtree_3.10.1 fastmatch_1.1-4
## [139] bit_4.0.5 gson_0.1.0
## [141] ape_5.7-1